{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_json(file_path):\n",
    "    data = []\n",
    "    with open(file_path, \"r\") as f:\n",
    "        for json_str in f:  # f is a file object\n",
    "            data.append(json.loads(json_str))\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "json_file_path = \"/localscratch/yzhuang43/ra-llm/ToolQA/data/raw_data/dblp/dblp_v14.json\"\n",
    "with open(json_file_path, 'r') as j:\n",
    "    contents = json.loads(j.read())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5259858\n",
      "{'id': '53e99784b7602d9701f3ffdd', 'title': 'Flow.', 'doi': '10.1145/1280120.1280225', 'issue': '10', 'keywords': [], 'lang': 'en', 'venue': {'raw': 'SIGGRAPH Art Gallery'}, 'year': 2006, 'n_citation': 15, 'page_start': '', 'page_end': '', 'volume': '', 'issn': '', 'isbn': '', 'url': [], 'abstract': '', 'authors': [{'id': '53f43776dabfaee0d9b6e75b', 'name': 'Masa Inakage', 'org': ''}], 'doc_type': 'Conference'}\n"
     ]
    }
   ],
   "source": [
    "print(len(contents))\n",
    "print(contents[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "data = pd.DataFrame(contents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.where(data.notnull(), \"[BLANK]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5259858\n",
      "3641544\n",
      "553320\n",
      "553320\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "import tqdm\n",
    "print(len(data))\n",
    "sub_data = data.loc[(data[\"references\"] != \"[BLANK]\")]\n",
    "print(len(sub_data))\n",
    "sub_data = data.loc[(data[\"year\"] > 2020)]\n",
    "print(len(sub_data))\n",
    "sub_data = sub_data.sort_values(by=[\"n_citation\"], ascending=True)\n",
    "sub_data_paper = sub_data.loc[(sub_data[\"references\"] != \"[BLANK]\")]\n",
    "print(len(sub_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['id', 'title', 'doi', 'issue', 'keywords', 'lang', 'venue', 'year', 'n_citation', 'page_start', 'page_end', 'volume', 'issn', 'isbn', 'url', 'abstract', 'authors', 'doc_type'])\n",
      "id                                           623b1afb5aee126c0fdf5169\n",
      "title                                 Game Awareness: A Questionnaire\n",
      "doi                                                 10.3390/g12040090\n",
      "issue                                                               4\n",
      "keywords            [game awareness,  game elements,  assessment, ...\n",
      "lang                                                               en\n",
      "venue                                                {'raw': 'GAMES'}\n",
      "year                                                             2021\n",
      "n_citation                                                          0\n",
      "page_start                                                         90\n",
      "page_end                                                             \n",
      "volume                                                             12\n",
      "issn                                                                 \n",
      "isbn                                                                 \n",
      "url                 [db/journals/games/games12.html#Kostelic21, ht...\n",
      "abstract            This paper deals with one of the possible meth...\n",
      "authors             [{'id': '', 'name': 'Katarina Kostelic', 'org'...\n",
      "doc_type                                                      Journal\n",
      "references                                                    [BLANK]\n",
      "fos                                                           [BLANK]\n",
      "indexed_abstract                                              [BLANK]\n",
      "v12_id                                                        [BLANK]\n",
      "v12_authors                                                   [BLANK]\n",
      "Name: 5049979, dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(contents[0].keys())\n",
    "print(sub_data.iloc[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "An Image is Worth 16x16 Words: Transformers for Image Recognition at  Scale\n",
      "[{'id': '53f430eedabfaee4dc746662', 'name': 'Alexey Dosovitskiy', 'org': 'Google'}, {'id': '53f439efdabfaee2a1d07241', 'name': 'Lucas Beyer', 'org': 'Google'}, {'id': '53f42e76dabfaee4dc72775e', 'name': 'Alexander Kolesnikov', 'org': 'Google'}, {'id': '53f46e13dabfaedf43663cbb', 'name': 'Dirk Weissenborn', 'org': 'Google'}, {'id': '53f43af4dabfaee2a1d12a3f', 'name': 'Xiaohua Zhai', 'org': 'Google'}, {'id': '562c84bd45cedb3398c4b92c', 'name': 'Thomas Unterthiner', 'org': 'Johannes Kepler University of Linz'}, {'id': '562cd89d45cedb3398ce05f8', 'name': 'Mostafa Dehghani', 'org': 'University of Amsterdam'}, {'id': '562ea42245ce1e5967e6d6b2', 'name': 'Matthias Minderer', 'org': 'Google'}, {'id': '53f45a50dabfaefedbb5e7d0', 'name': 'Georg Heigold', 'org': 'German Research Centre for Artificial Intelligence'}, {'id': '5628e21545ce1e59660ab6a8', 'name': 'Sylvain Gelly', 'org': 'Google'}, {'id': '53f44a79dabfaeee22a07a58', 'name': 'Jakob Uszkoreit', 'org': 'Google'}, {'id': '53f37a4bdabfae4b349e0122', 'name': 'Neil Houlsby', 'org': 'Google'}]\n",
      "2021\n",
      "{'raw': 'ICLR'}\n",
      "10851\n",
      "[]\n",
      "Conference\n",
      "[BLANK]\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "row = sub_data.iloc[-1]\n",
    "print(row[\"title\"])\n",
    "print(row[\"authors\"])\n",
    "print(row[\"year\"])\n",
    "print(row[\"venue\"])\n",
    "print(row[\"n_citation\"])\n",
    "print(row[\"keywords\"])\n",
    "print(row[\"doc_type\"])\n",
    "print(row[\"references\"])\n",
    "print(row[\"page_start\"])\n",
    "print(row[\"page_end\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import networkx as nx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "553320\n",
      "355368\n"
     ]
    }
   ],
   "source": [
    "# create mapping between papers and their ids and authors and their ids\n",
    "id2title_dict = {}\n",
    "id2author_dict = {}\n",
    "title2id_dict = {}\n",
    "author2id_dict = {}\n",
    "# data_dict = {}\n",
    "for i in range(len(sub_data)):\n",
    "    row = sub_data.iloc[i]\n",
    "    # data_dict[row[\"id\"]] = row\n",
    "    id2title_dict[row[\"id\"]] = row[\"title\"]\n",
    "    title2id_dict[row[\"title\"]] = row[\"id\"]\n",
    "    for j in range(len(row[\"authors\"])):\n",
    "        author_id = row[\"authors\"][j][\"id\"]\n",
    "        name = row[\"authors\"][j][\"name\"]\n",
    "        if author_id not in id2author_dict and author_id != '' and author_id != None and author_id != ' ':\n",
    "            id2author_dict[author_id] = name\n",
    "            author2id_dict[name] = author_id\n",
    "print(len(id2title_dict))\n",
    "print(len(id2author_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "63b7dbdddf1d768b3c17882c\n"
     ]
    }
   ],
   "source": [
    "author_id1 = random.choice(list(id2author_dict.keys()))\n",
    "print(author_id1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dict = {}\n",
    "for i in range(len(data)):\n",
    "    row = data.iloc[i]\n",
    "    data_dict[row[\"id\"]] = row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save dict to file\n",
    "with open(\"/localscratch/yzhuang43/ra-llm/ToolQA/data/external_corpus/dblp/title2id_dict.pkl\", \"wb\") as f:\n",
    "    pickle.dump(title2id_dict, f)\n",
    "with open(\"/localscratch/yzhuang43/ra-llm/ToolQA/data/external_corpus/dblp/author2id_dict.pkl\", \"wb\") as f:\n",
    "    pickle.dump(author2id_dict, f)\n",
    "with open(\"/localscratch/yzhuang43/ra-llm/ToolQA/data/external_corpus/dblp/id2title_dict.pkl\", \"wb\") as f:\n",
    "    pickle.dump(id2title_dict, f)\n",
    "with open(\"/localscratch/yzhuang43/ra-llm/ToolQA/data/external_corpus/dblp/id2author_dict.pkl\", \"wb\") as f:\n",
    "    pickle.dump(id2author_dict, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Chao Zhang\n",
      "Weihong Lin\n"
     ]
    }
   ],
   "source": [
    "author2id_dict[\"Chao Zhang\"]\n",
    "print(id2author_dict[\"632546fd128293c81ede38cc\"])\n",
    "print(id2author_dict[\"6373427bec88d95668d652c8\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "553320\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 553320/553320 [01:36<00:00, 5734.53it/s]\n"
     ]
    }
   ],
   "source": [
    "# create paper network and author network\n",
    "paper_net = nx.DiGraph()\n",
    "author_net = nx.Graph()\n",
    "print(len(sub_data))\n",
    "id2node = {}\n",
    "node2id = {}\n",
    "nodeid = 0\n",
    "for j in tqdm.tqdm(range(len(sub_data))):\n",
    "    context = sub_data.iloc[j]\n",
    "    id = context[\"id\"]\n",
    "    paper_net.add_node(id, title=context[\"title\"], authors = context[\"authors\"], year=context[\"year\"], venue=context[\"venue\"], n_citation=context[\"n_citation\"], keywords=context[\"keywords\"], doc_type=context[\"doc_type\"], page_start=context[\"page_start\"], page_end=context[\"page_end\"])\n",
    "    references = context[\"references\"]\n",
    "    if references != \"[BLANK]\":\n",
    "        for i in range(len(references)):\n",
    "            if not references[i] in paper_net.nodes():\n",
    "                if references[i] in data_dict:\n",
    "                    new_context = data_dict[references[i]]\n",
    "                    paper_net.add_node(references[i], title=new_context[\"title\"], authors=new_context[\"authors\"], year=new_context[\"year\"], venue=new_context[\"venue\"], n_citation=new_context[\"n_citation\"], keywords=new_context[\"keywords\"], doc_type=new_context[\"doc_type\"], page_start=new_context[\"page_start\"], page_end=new_context[\"page_end\"])\n",
    "                else:\n",
    "                    paper_net.add_node(references[i])\n",
    "            paper_net.add_edge(id, references[i])\n",
    "            # paper_net.add_edge(references[i], id)\n",
    "    authors = context[\"authors\"]\n",
    "    author_list = []\n",
    "    for i in range(len(authors)):\n",
    "        if authors[i][\"id\"] not in author_net.nodes() and authors[i][\"id\"] != '':\n",
    "            author_net.add_node(authors[i][\"id\"], org=authors[i][\"org\"])\n",
    "        if authors[i][\"id\"] != '':\n",
    "            author_list.append(authors[i][\"id\"])\n",
    "    author_list = list(set(author_list))\n",
    "    for i in range(len(author_list)):\n",
    "        for k in range(i + 1, len(author_list)):\n",
    "            if not (author_list[i], author_list[k]) in author_net.edges():\n",
    "                author_net.add_edge(author_list[i], author_list[k], weight=1, papers=[id], n_citation=[context[\"n_citation\"]])\n",
    "                author_net.add_edge(author_list[k], author_list[i], weight=1, papers=[id], n_citation=[context[\"n_citation\"]])\n",
    "            else:\n",
    "                author_net[author_list[i]][author_list[k]][\"weight\"] += 1\n",
    "                author_net[author_list[k]][author_list[i]][\"weight\"] += 1\n",
    "                author_net[author_list[i]][author_list[k]][\"papers\"].append(id)\n",
    "                author_net[author_list[k]][author_list[i]][\"papers\"].append(id)\n",
    "                author_net[author_list[i]][author_list[k]][\"n_citation\"].append(context[\"n_citation\"])\n",
    "                author_net[author_list[k]][author_list[i]][\"n_citation\"].append(context[\"n_citation\"])\n",
    "\n",
    "\n",
    "    # if j % 10000 == 0:\n",
    "    #     print(j)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/localscratch/yzhuang43/ra-llm/ToolQA/data/external_corpus/dblp/paper_net.pkl', 'wb') as f:\n",
    "    pickle.dump(paper_net, f)\n",
    "\n",
    "with open('/localscratch/yzhuang43/ra-llm/ToolQA/data/external_corpus/dblp/author_net.pkl', 'wb') as f:\n",
    "    pickle.dump(author_net, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
